/**
 * This code is released under the
 * Apache License Version 2.0 http://www.apache.org/licenses/.
 *
 * (c) Daniel Lemire, http://lemire.me/en/
 */

#pragma once
#include "bitpacking.h"

#include <stdexcept>

namespace pollux::fastpforlib {

namespace internal {

// Note that this only packs 8 values
inline void fastunpack_quarter(
    const uint8_t* __restrict in,
    uint8_t* __restrict out,
    const uint32_t bit) {
  // Could have used function pointers instead of switch.
  // Switch calls do offer the compiler more opportunities for optimization in
  // theory. In this case, it makes no difference with a good compiler.
  switch (bit) {
    case 0:
      internal::__fastunpack0(in, out);
      break;
    case 1:
      internal::__fastunpack1(in, out);
      break;
    case 2:
      internal::__fastunpack2(in, out);
      break;
    case 3:
      internal::__fastunpack3(in, out);
      break;
    case 4:
      internal::__fastunpack4(in, out);
      break;
    case 5:
      internal::__fastunpack5(in, out);
      break;
    case 6:
      internal::__fastunpack6(in, out);
      break;
    case 7:
      internal::__fastunpack7(in, out);
      break;
    case 8:
      internal::__fastunpack8(in, out);
      break;
    default:
      throw std::logic_error("Invalid bit width for bitpacking");
  }
}

// Note that this only packs 8 values
inline void fastpack_quarter(
    const uint8_t* __restrict in,
    uint8_t* __restrict out,
    const uint32_t bit) {
  // Could have used function pointers instead of switch.
  // Switch calls do offer the compiler more opportunities for optimization in
  // theory. In this case, it makes no difference with a good compiler.
  switch (bit) {
    case 0:
      internal::__fastpack0(in, out);
      break;
    case 1:
      internal::__fastpack1(in, out);
      break;
    case 2:
      internal::__fastpack2(in, out);
      break;
    case 3:
      internal::__fastpack3(in, out);
      break;
    case 4:
      internal::__fastpack4(in, out);
      break;
    case 5:
      internal::__fastpack5(in, out);
      break;
    case 6:
      internal::__fastpack6(in, out);
      break;
    case 7:
      internal::__fastpack7(in, out);
      break;
    case 8:
      internal::__fastpack8(in, out);
      break;
    default:
      throw std::logic_error("Invalid bit width for bitpacking");
  }
}

// Note that this only packs 16 values
inline void fastunpack_half(
    const uint16_t* __restrict in,
    uint16_t* __restrict out,
    const uint32_t bit) {
  // Could have used function pointers instead of switch.
  // Switch calls do offer the compiler more opportunities for optimization in
  // theory. In this case, it makes no difference with a good compiler.
  switch (bit) {
    case 0:
      internal::__fastunpack0(in, out);
      break;
    case 1:
      internal::__fastunpack1(in, out);
      break;
    case 2:
      internal::__fastunpack2(in, out);
      break;
    case 3:
      internal::__fastunpack3(in, out);
      break;
    case 4:
      internal::__fastunpack4(in, out);
      break;
    case 5:
      internal::__fastunpack5(in, out);
      break;
    case 6:
      internal::__fastunpack6(in, out);
      break;
    case 7:
      internal::__fastunpack7(in, out);
      break;
    case 8:
      internal::__fastunpack8(in, out);
      break;
    case 9:
      internal::__fastunpack9(in, out);
      break;
    case 10:
      internal::__fastunpack10(in, out);
      break;
    case 11:
      internal::__fastunpack11(in, out);
      break;
    case 12:
      internal::__fastunpack12(in, out);
      break;
    case 13:
      internal::__fastunpack13(in, out);
      break;
    case 14:
      internal::__fastunpack14(in, out);
      break;
    case 15:
      internal::__fastunpack15(in, out);
      break;
    case 16:
      internal::__fastunpack16(in, out);
      break;
    default:
      throw std::logic_error("Invalid bit width for bitpacking");
  }
}

// Note that this only packs 16 values
inline void fastpack_half(
    const uint16_t* __restrict in,
    uint16_t* __restrict out,
    const uint32_t bit) {
  // Could have used function pointers instead of switch.
  // Switch calls do offer the compiler more opportunities for optimization in
  // theory. In this case, it makes no difference with a good compiler.
  switch (bit) {
    case 0:
      internal::__fastpack0(in, out);
      break;
    case 1:
      internal::__fastpack1(in, out);
      break;
    case 2:
      internal::__fastpack2(in, out);
      break;
    case 3:
      internal::__fastpack3(in, out);
      break;
    case 4:
      internal::__fastpack4(in, out);
      break;
    case 5:
      internal::__fastpack5(in, out);
      break;
    case 6:
      internal::__fastpack6(in, out);
      break;
    case 7:
      internal::__fastpack7(in, out);
      break;
    case 8:
      internal::__fastpack8(in, out);
      break;
    case 9:
      internal::__fastpack9(in, out);
      break;
    case 10:
      internal::__fastpack10(in, out);
      break;
    case 11:
      internal::__fastpack11(in, out);
      break;
    case 12:
      internal::__fastpack12(in, out);
      break;
    case 13:
      internal::__fastpack13(in, out);
      break;
    case 14:
      internal::__fastpack14(in, out);
      break;
    case 15:
      internal::__fastpack15(in, out);
      break;
    case 16:
      internal::__fastpack16(in, out);
      break;
    default:
      throw std::logic_error("Invalid bit width for bitpacking");
  }
}
} // namespace internal

inline void fastunpack(
    const uint8_t* __restrict in,
    uint8_t* __restrict out,
    const uint32_t bit) {
  for (uint8_t i = 0; i < 4; i++) {
    internal::fastunpack_quarter(in + (i * bit), out + (i * 8), bit);
  }
}

inline void fastunpack(
    const uint16_t* __restrict in,
    uint16_t* __restrict out,
    const uint32_t bit) {
  internal::fastunpack_half(in, out, bit);
  internal::fastunpack_half(in + bit, out + 16, bit);
}

inline void fastunpack(
    const uint32_t* __restrict in,
    uint32_t* __restrict out,
    const uint32_t bit) {
  // Could have used function pointers instead of switch.
  // Switch calls do offer the compiler more opportunities for optimization in
  // theory. In this case, it makes no difference with a good compiler.
  switch (bit) {
    case 0:
      internal::__fastunpack0(in, out);
      break;
    case 1:
      internal::__fastunpack1(in, out);
      break;
    case 2:
      internal::__fastunpack2(in, out);
      break;
    case 3:
      internal::__fastunpack3(in, out);
      break;
    case 4:
      internal::__fastunpack4(in, out);
      break;
    case 5:
      internal::__fastunpack5(in, out);
      break;
    case 6:
      internal::__fastunpack6(in, out);
      break;
    case 7:
      internal::__fastunpack7(in, out);
      break;
    case 8:
      internal::__fastunpack8(in, out);
      break;
    case 9:
      internal::__fastunpack9(in, out);
      break;
    case 10:
      internal::__fastunpack10(in, out);
      break;
    case 11:
      internal::__fastunpack11(in, out);
      break;
    case 12:
      internal::__fastunpack12(in, out);
      break;
    case 13:
      internal::__fastunpack13(in, out);
      break;
    case 14:
      internal::__fastunpack14(in, out);
      break;
    case 15:
      internal::__fastunpack15(in, out);
      break;
    case 16:
      internal::__fastunpack16(in, out);
      break;
    case 17:
      internal::__fastunpack17(in, out);
      break;
    case 18:
      internal::__fastunpack18(in, out);
      break;
    case 19:
      internal::__fastunpack19(in, out);
      break;
    case 20:
      internal::__fastunpack20(in, out);
      break;
    case 21:
      internal::__fastunpack21(in, out);
      break;
    case 22:
      internal::__fastunpack22(in, out);
      break;
    case 23:
      internal::__fastunpack23(in, out);
      break;
    case 24:
      internal::__fastunpack24(in, out);
      break;
    case 25:
      internal::__fastunpack25(in, out);
      break;
    case 26:
      internal::__fastunpack26(in, out);
      break;
    case 27:
      internal::__fastunpack27(in, out);
      break;
    case 28:
      internal::__fastunpack28(in, out);
      break;
    case 29:
      internal::__fastunpack29(in, out);
      break;
    case 30:
      internal::__fastunpack30(in, out);
      break;
    case 31:
      internal::__fastunpack31(in, out);
      break;
    case 32:
      internal::__fastunpack32(in, out);
      break;
    default:
      throw std::logic_error("Invalid bit width for bitpacking");
  }
}

inline void fastunpack(
    const uint32_t* __restrict in,
    uint64_t* __restrict out,
    const uint32_t bit) {
  // Could have used function pointers instead of switch.
  // Switch calls do offer the compiler more opportunities for optimization in
  // theory. In this case, it makes no difference with a good compiler.
  switch (bit) {
    case 0:
      internal::__fastunpack0(in, out);
      break;
    case 1:
      internal::__fastunpack1(in, out);
      break;
    case 2:
      internal::__fastunpack2(in, out);
      break;
    case 3:
      internal::__fastunpack3(in, out);
      break;
    case 4:
      internal::__fastunpack4(in, out);
      break;
    case 5:
      internal::__fastunpack5(in, out);
      break;
    case 6:
      internal::__fastunpack6(in, out);
      break;
    case 7:
      internal::__fastunpack7(in, out);
      break;
    case 8:
      internal::__fastunpack8(in, out);
      break;
    case 9:
      internal::__fastunpack9(in, out);
      break;
    case 10:
      internal::__fastunpack10(in, out);
      break;
    case 11:
      internal::__fastunpack11(in, out);
      break;
    case 12:
      internal::__fastunpack12(in, out);
      break;
    case 13:
      internal::__fastunpack13(in, out);
      break;
    case 14:
      internal::__fastunpack14(in, out);
      break;
    case 15:
      internal::__fastunpack15(in, out);
      break;
    case 16:
      internal::__fastunpack16(in, out);
      break;
    case 17:
      internal::__fastunpack17(in, out);
      break;
    case 18:
      internal::__fastunpack18(in, out);
      break;
    case 19:
      internal::__fastunpack19(in, out);
      break;
    case 20:
      internal::__fastunpack20(in, out);
      break;
    case 21:
      internal::__fastunpack21(in, out);
      break;
    case 22:
      internal::__fastunpack22(in, out);
      break;
    case 23:
      internal::__fastunpack23(in, out);
      break;
    case 24:
      internal::__fastunpack24(in, out);
      break;
    case 25:
      internal::__fastunpack25(in, out);
      break;
    case 26:
      internal::__fastunpack26(in, out);
      break;
    case 27:
      internal::__fastunpack27(in, out);
      break;
    case 28:
      internal::__fastunpack28(in, out);
      break;
    case 29:
      internal::__fastunpack29(in, out);
      break;
    case 30:
      internal::__fastunpack30(in, out);
      break;
    case 31:
      internal::__fastunpack31(in, out);
      break;
    case 32:
      internal::__fastunpack32(in, out);
      break;
    case 33:
      internal::__fastunpack33(in, out);
      break;
    case 34:
      internal::__fastunpack34(in, out);
      break;
    case 35:
      internal::__fastunpack35(in, out);
      break;
    case 36:
      internal::__fastunpack36(in, out);
      break;
    case 37:
      internal::__fastunpack37(in, out);
      break;
    case 38:
      internal::__fastunpack38(in, out);
      break;
    case 39:
      internal::__fastunpack39(in, out);
      break;
    case 40:
      internal::__fastunpack40(in, out);
      break;
    case 41:
      internal::__fastunpack41(in, out);
      break;
    case 42:
      internal::__fastunpack42(in, out);
      break;
    case 43:
      internal::__fastunpack43(in, out);
      break;
    case 44:
      internal::__fastunpack44(in, out);
      break;
    case 45:
      internal::__fastunpack45(in, out);
      break;
    case 46:
      internal::__fastunpack46(in, out);
      break;
    case 47:
      internal::__fastunpack47(in, out);
      break;
    case 48:
      internal::__fastunpack48(in, out);
      break;
    case 49:
      internal::__fastunpack49(in, out);
      break;
    case 50:
      internal::__fastunpack50(in, out);
      break;
    case 51:
      internal::__fastunpack51(in, out);
      break;
    case 52:
      internal::__fastunpack52(in, out);
      break;
    case 53:
      internal::__fastunpack53(in, out);
      break;
    case 54:
      internal::__fastunpack54(in, out);
      break;
    case 55:
      internal::__fastunpack55(in, out);
      break;
    case 56:
      internal::__fastunpack56(in, out);
      break;
    case 57:
      internal::__fastunpack57(in, out);
      break;
    case 58:
      internal::__fastunpack58(in, out);
      break;
    case 59:
      internal::__fastunpack59(in, out);
      break;
    case 60:
      internal::__fastunpack60(in, out);
      break;
    case 61:
      internal::__fastunpack61(in, out);
      break;
    case 62:
      internal::__fastunpack62(in, out);
      break;
    case 63:
      internal::__fastunpack63(in, out);
      break;
    case 64:
      internal::__fastunpack64(in, out);
      break;
    default:
      throw std::logic_error("Invalid bit width for bitpacking");
  }
}

inline void fastpack(
    const uint8_t* __restrict in,
    uint8_t* __restrict out,
    const uint32_t bit) {
  for (uint8_t i = 0; i < 4; i++) {
    internal::fastpack_quarter(in + (i * 8), out + (i * bit), bit);
  }
}

inline void fastpack(
    const uint16_t* __restrict in,
    uint16_t* __restrict out,
    const uint32_t bit) {
  internal::fastpack_half(in, out, bit);
  internal::fastpack_half(in + 16, out + bit, bit);
}

inline void fastpack(
    const uint32_t* __restrict in,
    uint32_t* __restrict out,
    const uint32_t bit) {
  // Could have used function pointers instead of switch.
  // Switch calls do offer the compiler more opportunities for optimization in
  // theory. In this case, it makes no difference with a good compiler.
  switch (bit) {
    case 0:
      internal::__fastpack0(in, out);
      break;
    case 1:
      internal::__fastpack1(in, out);
      break;
    case 2:
      internal::__fastpack2(in, out);
      break;
    case 3:
      internal::__fastpack3(in, out);
      break;
    case 4:
      internal::__fastpack4(in, out);
      break;
    case 5:
      internal::__fastpack5(in, out);
      break;
    case 6:
      internal::__fastpack6(in, out);
      break;
    case 7:
      internal::__fastpack7(in, out);
      break;
    case 8:
      internal::__fastpack8(in, out);
      break;
    case 9:
      internal::__fastpack9(in, out);
      break;
    case 10:
      internal::__fastpack10(in, out);
      break;
    case 11:
      internal::__fastpack11(in, out);
      break;
    case 12:
      internal::__fastpack12(in, out);
      break;
    case 13:
      internal::__fastpack13(in, out);
      break;
    case 14:
      internal::__fastpack14(in, out);
      break;
    case 15:
      internal::__fastpack15(in, out);
      break;
    case 16:
      internal::__fastpack16(in, out);
      break;
    case 17:
      internal::__fastpack17(in, out);
      break;
    case 18:
      internal::__fastpack18(in, out);
      break;
    case 19:
      internal::__fastpack19(in, out);
      break;
    case 20:
      internal::__fastpack20(in, out);
      break;
    case 21:
      internal::__fastpack21(in, out);
      break;
    case 22:
      internal::__fastpack22(in, out);
      break;
    case 23:
      internal::__fastpack23(in, out);
      break;
    case 24:
      internal::__fastpack24(in, out);
      break;
    case 25:
      internal::__fastpack25(in, out);
      break;
    case 26:
      internal::__fastpack26(in, out);
      break;
    case 27:
      internal::__fastpack27(in, out);
      break;
    case 28:
      internal::__fastpack28(in, out);
      break;
    case 29:
      internal::__fastpack29(in, out);
      break;
    case 30:
      internal::__fastpack30(in, out);
      break;
    case 31:
      internal::__fastpack31(in, out);
      break;
    case 32:
      internal::__fastpack32(in, out);
      break;
    default:
      throw std::logic_error("Invalid bit width for bitpacking");
  }
}

inline void fastpack(
    const uint64_t* __restrict in,
    uint32_t* __restrict out,
    const uint32_t bit) {
  switch (bit) {
    case 0:
      internal::__fastpack0(in, out);
      break;
    case 1:
      internal::__fastpack1(in, out);
      break;
    case 2:
      internal::__fastpack2(in, out);
      break;
    case 3:
      internal::__fastpack3(in, out);
      break;
    case 4:
      internal::__fastpack4(in, out);
      break;
    case 5:
      internal::__fastpack5(in, out);
      break;
    case 6:
      internal::__fastpack6(in, out);
      break;
    case 7:
      internal::__fastpack7(in, out);
      break;
    case 8:
      internal::__fastpack8(in, out);
      break;
    case 9:
      internal::__fastpack9(in, out);
      break;
    case 10:
      internal::__fastpack10(in, out);
      break;
    case 11:
      internal::__fastpack11(in, out);
      break;
    case 12:
      internal::__fastpack12(in, out);
      break;
    case 13:
      internal::__fastpack13(in, out);
      break;
    case 14:
      internal::__fastpack14(in, out);
      break;
    case 15:
      internal::__fastpack15(in, out);
      break;
    case 16:
      internal::__fastpack16(in, out);
      break;
    case 17:
      internal::__fastpack17(in, out);
      break;
    case 18:
      internal::__fastpack18(in, out);
      break;
    case 19:
      internal::__fastpack19(in, out);
      break;
    case 20:
      internal::__fastpack20(in, out);
      break;
    case 21:
      internal::__fastpack21(in, out);
      break;
    case 22:
      internal::__fastpack22(in, out);
      break;
    case 23:
      internal::__fastpack23(in, out);
      break;
    case 24:
      internal::__fastpack24(in, out);
      break;
    case 25:
      internal::__fastpack25(in, out);
      break;
    case 26:
      internal::__fastpack26(in, out);
      break;
    case 27:
      internal::__fastpack27(in, out);
      break;
    case 28:
      internal::__fastpack28(in, out);
      break;
    case 29:
      internal::__fastpack29(in, out);
      break;
    case 30:
      internal::__fastpack30(in, out);
      break;
    case 31:
      internal::__fastpack31(in, out);
      break;
    case 32:
      internal::__fastpack32(in, out);
      break;
    case 33:
      internal::__fastpack33(in, out);
      break;
    case 34:
      internal::__fastpack34(in, out);
      break;
    case 35:
      internal::__fastpack35(in, out);
      break;
    case 36:
      internal::__fastpack36(in, out);
      break;
    case 37:
      internal::__fastpack37(in, out);
      break;
    case 38:
      internal::__fastpack38(in, out);
      break;
    case 39:
      internal::__fastpack39(in, out);
      break;
    case 40:
      internal::__fastpack40(in, out);
      break;
    case 41:
      internal::__fastpack41(in, out);
      break;
    case 42:
      internal::__fastpack42(in, out);
      break;
    case 43:
      internal::__fastpack43(in, out);
      break;
    case 44:
      internal::__fastpack44(in, out);
      break;
    case 45:
      internal::__fastpack45(in, out);
      break;
    case 46:
      internal::__fastpack46(in, out);
      break;
    case 47:
      internal::__fastpack47(in, out);
      break;
    case 48:
      internal::__fastpack48(in, out);
      break;
    case 49:
      internal::__fastpack49(in, out);
      break;
    case 50:
      internal::__fastpack50(in, out);
      break;
    case 51:
      internal::__fastpack51(in, out);
      break;
    case 52:
      internal::__fastpack52(in, out);
      break;
    case 53:
      internal::__fastpack53(in, out);
      break;
    case 54:
      internal::__fastpack54(in, out);
      break;
    case 55:
      internal::__fastpack55(in, out);
      break;
    case 56:
      internal::__fastpack56(in, out);
      break;
    case 57:
      internal::__fastpack57(in, out);
      break;
    case 58:
      internal::__fastpack58(in, out);
      break;
    case 59:
      internal::__fastpack59(in, out);
      break;
    case 60:
      internal::__fastpack60(in, out);
      break;
    case 61:
      internal::__fastpack61(in, out);
      break;
    case 62:
      internal::__fastpack62(in, out);
      break;
    case 63:
      internal::__fastpack63(in, out);
      break;
    case 64:
      internal::__fastpack64(in, out);
      break;
    default:
      throw std::logic_error("Invalid bit width for bitpacking");
  }
}
} // namespace pollux::fastpforlib
